from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from tqdm.auto import tqdm
df = pd.read_csv('https://raw.githubusercontent.com/wsovine/data/main/nhl_moneyline_model_logistic_regression_2022_season.csv')
df.sort_values('game_date', inplace=True)
df.reset_index(inplace=True, drop=True)
X = df.copy()
X = X.drop(['home', 'away', 'game_date', 'season',
'home_goals', 'away_goals', 'home_win'], axis=1)
y = df.home_win.values
print(X.shape)
tss = TimeSeriesSplit(n_splits=25)
(1316, 92)
Functions to scale predictors and add one-hot encode categorical variables
def scale(X_train, X_test) -> (pd.DataFrame, pd.DataFrame):
scaler = StandardScaler()
X_train.loc[:,:] = scaler.fit_transform(X_train)
X_test.loc[:,:] = scaler.transform(X_test)
return (X_train, X_test)
def dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
home_dummies = pd.get_dummies(original_df.home, prefix='home')
X_train = X_train.join(home_dummies, how='inner')
X_test = X_test.join(home_dummies, how='inner')
away_dummies = pd.get_dummies(original_df.away, prefix='away')
X_train = X_train.join(away_dummies, how='inner')
X_test = X_test.join(away_dummies, how='inner')
return (X_train, X_test)
def scale_and_dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
X_train, X_test = scale(X_train, X_test)
X_train, X_test = dummy(X_train, X_test, original_df)
return (X_train, X_test)
Functions for calculations that involve the odds and calculating profit
# This function converts american odds to win probability
def odds_to_profit(odds, u = 1):
if odds < 0:
profit = -100 / odds
else:
profit = odds / 100
return profit * u
def odds_to_loss(odds, u = 1):
if odds < 0:
loss = odds / -100
else:
loss = 100 / odds
return -loss * u
def odds_to_prob(odds):
if odds < 0:
prob = odds / (odds - 100)
else:
prob = 100 / (odds + 100)
return prob
def odds_to_dec(odds):
if odds < 0:
dec = (100 / -odds)
else:
dec = (odds / 100)
return dec + 1
def profitize(df, aggregated=True, kelly_scale=100):
df['away_win'] = np.where(df.home_win, 0, 1)
df['home_decimal_odds'] = df.home_consensus.apply(lambda o: odds_to_dec(o))
df['away_decimal_odds'] = df.away_consensus.apply(lambda o: odds_to_dec(o))
# Determine the profit or loss for a bet on either team in each matchup
df['bet_on_home'] = np.where(
df.home_prob > df.home_implied_prob, 1, 0
)
df['bet_on_away'] = np.where(
df.away_prob > df.away_implied_prob, 1, 0
)
# Flat 1u bet
df['home_profit_if_win'] = df.home_consensus.apply(lambda o: odds_to_profit(o))
df['away_profit_if_win'] = df.away_consensus.apply(lambda o: odds_to_profit(o))
df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, -1)
df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, -1)
df['flat_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)
# Bet to win 1u
df['home_loss_if_lose'] = df.home_consensus.apply(lambda o: odds_to_loss(o))
df['away_loss_if_lose'] = df.away_consensus.apply(lambda o: odds_to_loss(o))
df['home_pnl_if_bet'] = np.where(df.home_win, 1, df.home_loss_if_lose)
df['away_pnl_if_bet'] = np.where(df.away_win, 1, df.away_loss_if_lose)
df['win_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)
# Kelly Criterion
df['home_kelly_bet'] = np.where(
df.home_prob > df.home_implied_prob,
(df.home_prob - df.away_prob / df.home_decimal_odds) * kelly_scale,
0)
df['away_kelly_bet'] = np.where(
df.away_prob > df.away_implied_prob,
(df.away_prob - df.home_prob / df.away_decimal_odds) * kelly_scale,
0)
df['home_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.home_consensus, r.home_kelly_bet), axis=1)
df['away_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.away_consensus, r.away_kelly_bet), axis=1)
df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, df.home_kelly_bet * -1)
df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, df.away_kelly_bet * -1)
df['kelly_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)
if aggregated:
return df.flat_1u_pnl.sum(), df.win_1u_pnl.sum(), df.kelly_pnl.sum()
return df.flat_1u_pnl, df.win_1u_pnl, df.kelly_pnl
results = []
for i, (train_index, test_index) in enumerate(tss.split(X)):
X_test = X.iloc[test_index]
y_test = y[test_index]
y_pred = np.where(X_test.home_implied_prob >= .5, 1, 0)
y_prob = X_test.home_implied_prob
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
results.append({
# 'Fold': i,
'Accuracy': accuracy,
'ROC AUC': roc_auc
})
df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
Average Accuracy: 0.5856 Average ROC AUC: 0.6478545892291949
| Accuracy | ROC AUC | |
|---|---|---|
| 0 | 0.580000 | 0.579710 |
| 1 | 0.460000 | 0.620994 |
| 2 | 0.540000 | 0.553600 |
| 3 | 0.620000 | 0.678400 |
| 4 | 0.560000 | 0.613527 |
| 5 | 0.640000 | 0.765833 |
| 6 | 0.560000 | 0.652709 |
| 7 | 0.560000 | 0.643317 |
| 8 | 0.660000 | 0.734300 |
| 9 | 0.460000 | 0.651067 |
| 10 | 0.540000 | 0.600000 |
| 11 | 0.640000 | 0.728896 |
| 12 | 0.680000 | 0.649351 |
| 13 | 0.580000 | 0.491790 |
| 14 | 0.600000 | 0.696970 |
| 15 | 0.600000 | 0.603365 |
| 16 | 0.580000 | 0.597403 |
| 17 | 0.560000 | 0.706731 |
| 18 | 0.540000 | 0.590982 |
| 19 | 0.620000 | 0.698686 |
| 20 | 0.600000 | 0.660800 |
| 21 | 0.700000 | 0.790640 |
| 22 | 0.680000 | 0.770032 |
| 23 | 0.600000 | 0.788462 |
| 24 | 0.480000 | 0.328800 |
results = []
coefs = []
for train_index, test_index in tqdm(tss.split(X), total=tss.n_splits):
X_train = X.iloc[train_index].copy()
X_test = X.iloc[test_index].copy()
X_train, X_test = scale_and_dummy(X_train, X_test, df)
y_train = y[train_index]
y_test = y[test_index]
logr = LogisticRegressionCV(solver='liblinear')
logr.fit(X_train, y_train)
y_pred = logr.predict(X_test)
y_prob = logr.predict_proba(X_test)[:,1]
df_test = df.iloc[test_index].copy()
df_test['home_prob'] = y_prob
df_test['away_prob'] = 1 - df_test.home_prob
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)
flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, kelly_scale=1)
results.append({
'C': logr.C_,
'Accuracy': accuracy,
'ROC AUC': roc_auc,
'Flat 1u PNL': flat_1u_pnl,
'Win 1u PNL': win_1u_pnl,
'Kelly PNL': kelly_pnl
})
coefs.append(
pd.DataFrame(
zip(logr.feature_names_in_, logr.coef_[0]),
columns=['feature', 'coef']
)
)
df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
print(f"Flat 1u PNL: {df_results['Flat 1u PNL'].sum()}")
print(f"Win 1u PNL: {df_results['Win 1u PNL'].sum()}")
print(f"Kelly PNL: {df_results['Kelly PNL'].sum()}")
display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
0%| | 0/25 [00:00<?, ?it/s]
Average Accuracy: 0.6008 Average ROC AUC: 0.6459448800634778 Flat 1u PNL: 63.31012164469311 Win 1u PNL: 70.8473185263898 Kelly PNL: 20.06039080559668
| C | Accuracy | ROC AUC | Flat 1u PNL | Win 1u PNL | Kelly PNL | |
|---|---|---|---|---|---|---|
| 0 | [0.00077426] | 0.560000 | 0.571659 | 3.179971 | 1.727962 | 0.825447 |
| 1 | [0.0001] | 0.500000 | 0.607372 | 8.410000 | 8.416261 | 1.998805 |
| 2 | [0.0001] | 0.460000 | 0.528000 | 6.550000 | 4.687297 | 1.932735 |
| 3 | [0.0001] | 0.580000 | 0.678400 | -4.380000 | -1.864863 | -1.348194 |
| 4 | [0.00599484] | 0.580000 | 0.587762 | 9.110084 | 6.294694 | 1.816271 |
| 5 | [0.00077426] | 0.680000 | 0.745000 | 0.339476 | 3.111434 | 0.682678 |
| 6 | [0.0001] | 0.660000 | 0.666667 | -1.897619 | -1.908672 | -0.698671 |
| 7 | [0.00077426] | 0.560000 | 0.631240 | 7.754195 | 5.508981 | 1.838118 |
| 8 | [0.00077426] | 0.660000 | 0.764895 | -7.397860 | -4.732032 | -1.713640 |
| 9 | [0.0001] | 0.580000 | 0.655172 | 4.939091 | 3.742878 | 1.392340 |
| 10 | [0.0001] | 0.540000 | 0.595200 | 0.262381 | 0.155511 | 0.068563 |
| 11 | [0.00077426] | 0.680000 | 0.728896 | 2.237104 | 5.560010 | 1.294972 |
| 12 | [0.04641589] | 0.600000 | 0.608766 | -3.787923 | -0.934046 | -1.156780 |
| 13 | [0.00599484] | 0.540000 | 0.512315 | 5.407887 | 1.059649 | 0.415893 |
| 14 | [0.00599484] | 0.620000 | 0.625668 | 9.004708 | 9.554287 | 2.976067 |
| 15 | [0.00077426] | 0.580000 | 0.604167 | 3.047963 | -0.515959 | 0.298963 |
| 16 | [0.00077426] | 0.600000 | 0.599026 | 11.010140 | 7.040508 | 2.460569 |
| 17 | [0.00077426] | 0.620000 | 0.714744 | -2.129691 | 2.967900 | 0.303268 |
| 18 | [0.00599484] | 0.580000 | 0.650564 | 12.687147 | 8.900983 | 4.069783 |
| 19 | [0.00077426] | 0.620000 | 0.722496 | 6.014087 | 7.815041 | 2.294935 |
| 20 | [0.00077426] | 0.600000 | 0.667200 | -6.274944 | -6.621247 | -1.603295 |
| 21 | [0.00077426] | 0.760000 | 0.784893 | -10.308423 | -4.192636 | -1.481131 |
| 22 | [0.00599484] | 0.720000 | 0.798077 | 0.022357 | 6.860426 | 1.188093 |
| 23 | [0.00077426] | 0.720000 | 0.777244 | -1.691465 | 4.307596 | 0.951462 |
| 24 | [0.00077426] | 0.420000 | 0.323200 | 11.201457 | 3.905358 | 1.253140 |
pd.concat(coefs, axis=0).groupby('feature').agg(
{'coef': ['mean', 'median', 'std']}
).sort_values(('coef', 'median'), key=abs, ascending=False).head()
| coef | |||
|---|---|---|---|
| mean | median | std | |
| feature | |||
| home_goal_diff_offense_str | -0.040254 | -0.026278 | 0.047158 |
| away_goal_diff_defense_str | -0.030086 | -0.018598 | 0.042356 |
| home_implied_prob | 0.019238 | 0.012753 | 0.026518 |
| away_implied_prob | -0.019514 | -0.012598 | 0.027504 |
| home_TwinSpires | -0.017955 | -0.012194 | 0.028835 |
dfs = []
game_dates = df.groupby(['season', 'game_date']).count().reset_index()[['season', 'game_date']]
kelly_bankroll = 100
all_game_dates = tqdm(game_dates.itertuples(), total=game_dates.shape[0])
for scope in all_game_dates:
y_train = y[(df.game_date < scope.game_date) & (df.season == scope.season)]
if (len(y_train) >= 16
and y_train[y_train == 0].shape[0] > 5
and y_train[y_train == 1].shape[0] > 5):
X_train = X[(df.game_date < scope.game_date) & (df.season == scope.season)].copy()
df_test = df[df.game_date == scope.game_date]
X_test = X.iloc[df_test.index].copy()
df_test = df_test[['season', 'game_date', 'home', 'away', 'home_goals', 'away_goals', 'home_win', 'home_consensus', 'away_consensus', 'home_implied_prob', 'away_implied_prob']]
df_test['away_win'] = np.where(df_test.home_win, 0, 1)
X_train, X_test = scale_and_dummy(X_train, X_test, df)
logr = LogisticRegression(C=.00077426, solver='liblinear')
logr.fit(X_train, y_train)
y_pred = logr.predict(X_test)
y_prob = logr.predict_proba(X_test)[:,1]
df_test['home_prob'] = y_prob
df_test['away_prob'] = 1 - df_test.home_prob
flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, aggregated=False, kelly_scale=100)
df_test['flat_1u_pnl'] = flat_1u_pnl
df_test['win_1u_pnl'] = win_1u_pnl
df_test['kelly_pnl'] = kelly_pnl
kelly_bankroll += kelly_pnl.sum()
all_game_dates.set_description(f'{scope.game_date} | ${kelly_bankroll:.0f}')
dfs.append(df_test)
df_test = pd.concat(dfs)
# display(df_test)
0%| | 0/220 [00:00<?, ?it/s]
df_pnl = df_test.copy()
df_pnl['game_date'] = pd.to_datetime(df_pnl.game_date, format='%Y%m%d')
df_pnl['cum_flat_1u_pnl'] = df_pnl.groupby('season').flat_1u_pnl.transform('cumsum')
df_pnl['cum_win_1u_pnl'] = df_pnl.groupby('season').win_1u_pnl.transform('cumsum')
df_pnl['cum_kelly_pnl'] = df_pnl.groupby('season').kelly_pnl.transform('cumsum')
fig = px.area(df_pnl, x='game_date', y='cum_kelly_pnl', color='season')
fig.update_yaxes(title='Profit / Loss')
fig.update_xaxes(title='Date')